EPPS 6302 Methods of Data Collection and Production

Assignment 3: Text Analytics using “quanteda”

  • This assignment involves analyzing a dataset of U.S. presidential inaugural speeches using the “quanteda” package, which provides effective tools for visualizing and analyzing text data.
    • Word Network: A word network is created to visualize relationships between words within the speeches, with network graphs illustrating how different terms are interconnected.

    • Word Clouds: Word clouds are generated for each president to visualize prominent vocabulary and linguistic characteristics in their speeches, with Trump's vocabulary, for instance, showing greater diversity on certain American themes.

    • Word Frequency: Word frequency analysis shows that Trump and Clinton share nearly equal relative frequencies for some words, highlighting common themes.

    • Latent Semantic Analysis (LSA): LSA uncovers semantic relationships between key topics and related terms within speeches, helping to understand evolving themes or core messages across different presidents’ speeches.

    • Relative Frequency Visualization: Visualizing the relative frequency of specific terms (e.g., “American”) reveals how the importance of certain words shifts over time, reflecting changes in thematic focus or societal priorities in presidential speeches.

# Sample program for using quanteda for text modeling and analysis
# Documentation: vignette("quickstart", package = "quanteda")
# Website: https://quanteda.io/

library(quanteda)
Warning: package 'quanteda' was built under R version 4.3.3
Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"ndiMatrix" of class "replValueSp"; definition not updated
Package version: 4.1.0
Unicode version: 14.0
ICU version: 71.1
Parallel computing: disabled
See https://quanteda.io for tutorials and examples.
library(quanteda.textmodels)
Warning: package 'quanteda.textmodels' was built under R version 4.3.3
library(quanteda.textplots)
Warning: package 'quanteda.textplots' was built under R version 4.3.3
library(readr)
library(ggplot2)
# Twitter data about President Biden and Xi summit in Novemeber 2021
# Do some background search/study on the event
# 
summit <- read_csv("https://raw.githubusercontent.com/datageneration/datamethods/master/textanalytics/summit_11162021.csv")
Rows: 14520 Columns: 90
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (50): screen_name, text, source, reply_to_screen_name, hashtags, symbol...
dbl  (26): user_id, status_id, display_text_width, reply_to_status_id, reply...
lgl  (10): is_quote, is_retweet, quote_count, reply_count, ext_media_type, q...
dttm  (4): created_at, quoted_created_at, retweet_created_at, account_create...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sum_twt = summit$text
toks = tokens(sum_twt)
sumtwtdfm <- dfm(toks)
class(toks)
[1] "tokens"
# Latent Semantic Analysis 
## (https://quanteda.io/reference/textmodel_lsa.html)

sum_lsa <- textmodel_lsa(sumtwtdfm, nd=4,  margin = c("both", "documents", "features"))
summary(sum_lsa)
                Length    Class     Mode   
sk                      4 -none-    numeric
docs                58080 -none-    numeric
features            63972 -none-    numeric
matrix_low_rank 232218360 -none-    numeric
data            232218360 dgCMatrix S4     
head(sum_lsa$docs)
              [,1]          [,2]          [,3]          [,4]
text1 8.670375e-03  9.539431e-03 -3.365261e-03  1.378640e-02
text2 8.662406e-06 -8.754517e-06 -6.159723e-06  1.673892e-05
text3 2.917454e-03  6.809891e-03  1.059921e-03 -3.180288e-03
text4 1.046103e-02  8.782783e-04 -4.359418e-03  4.941183e-03
text5 3.247147e-03  8.006068e-03  1.632191e-04 -4.657788e-03
text6 3.247147e-03  8.006068e-03  1.632191e-04 -4.657788e-03
class(sum_lsa)
[1] "textmodel_lsa"
tweet_dfm <- tokens(sum_twt, remove_punct = TRUE) %>%
  dfm()
head(tweet_dfm)
Document-feature matrix of: 6 documents, 15,927 features (99.89% sparse) and 0 docvars.
       features
docs    breaking news us president biden amp communist china leader xi
  text1        1    1  1         1     1   1         1     2      1  1
  text2        0    0  0         0     0   0         0     0      0  0
  text3        0    0  0         0     1   0         0     0      0  1
  text4        0    0  0         1     1   0         0     0      0  1
  text5        0    0  0         0     1   0         0     0      0  1
  text6        0    0  0         0     1   0         0     0      0  1
[ reached max_nfeat ... 15,917 more features ]
tag_dfm <- dfm_select(tweet_dfm, pattern = "#*")
toptag <- names(topfeatures(tag_dfm, 50))
head(toptag, 10)
 [1] "#china"       "#biden"       "#xijinping"   "#joebiden"    "#america"    
 [6] "#americans"   "#coronavirus" "#fentanyl"    "#xi"          "#us"         
library("quanteda.textplots")
tag_fcm <- fcm(tag_dfm)
head(tag_fcm)
Feature co-occurrence matrix of: 6 by 665 features.
               features
features        #breaking #breakingnews #biden #china #usa #pray4america
  #breaking             0             4      5      5    5             0
  #breakingnews         0             0      4      5    4             0
  #biden                0             0      0    443   49             0
  #china                0             0      0      8   76             0
  #usa                  0             0      0      0    6             0
  #pray4america         0             0      0      0    0             0
               features
features        #joebiden #xijinping #america #americans
  #breaking             0          0        0          0
  #breakingnews         0          0        0          0
  #biden              299        370      302        295
  #china              339        434      308        295
  #usa                 12         15        0          0
  #pray4america         0          0        0          0
[ reached max_nfeat ... 655 more features ]
topgat_fcm <- fcm_select(tag_fcm, pattern = toptag)
textplot_network(topgat_fcm, min_freq = 50, edge_alpha = 0.8, edge_size = 1)

user_dfm <- dfm_select(tweet_dfm, pattern = "@*")
topuser <- names(topfeatures(user_dfm, 50))
head(topuser, 20)
 [1] "@potus"           "@politico"        "@joebiden"        "@jendeben"       
 [5] "@eneskanter"      "@nwadhams"        "@phelimkine"      "@nahaltoosi"     
 [9] "@nba"             "@washwizards"     "@pelicansnba"     "@capitalonearena"
[13] "@kevinliptakcnn"  "@foxbusiness"     "@morningsmaria"   "@scmpnews"       
[17] "@petermartin_pcm" "@nytimes"         "@uyghur_american" "@kaylatausche"   
user_fcm <- fcm(user_dfm)
head(user_fcm, 20)
Feature co-occurrence matrix of: 20 by 711 features.
                 features
features          @youtube @bfmtv @cnn @lauhaim @barackobama @joebiden
  @youtube               0      0    0        0            0         0
  @bfmtv                 0      0    1        1            1         1
  @cnn                   0      0    0        1            1         1
  @lauhaim               0      0    0        0            1         1
  @barackobama           0      0    0        0            0         1
  @joebiden              0      0    0        0            0         3
  @kamalaharris          0      0    0        0            0         0
  @hillaryclinton        0      0    0        0            0         0
  @billclinton           0      0    0        0            0         0
  @cbsnews               0      0    0        0            0         0
                 features
features          @kamalaharris @hillaryclinton @billclinton @cbsnews
  @youtube                    0               0            0        0
  @bfmtv                      1               1            1        1
  @cnn                        1               1            1        1
  @lauhaim                    1               1            1        1
  @barackobama                1               1            1        1
  @joebiden                   1               1            1        1
  @kamalaharris               0               1            1        1
  @hillaryclinton             0               0            1        1
  @billclinton                0               0            0        1
  @cbsnews                    0               0            0        0
[ reached max_feat ... 10 more features, reached max_nfeat ... 701 more features ]
user_fcm <- fcm_select(user_fcm, pattern = topuser)
textplot_network(user_fcm, min_freq = 20, edge_color = "firebrick", edge_alpha = 0.8, edge_size = 1)

library(quanteda)
library(quanteda.textmodels)
library(quanteda.textplots)
library(readr)
library(ggplot2)

# Wordcloud
# based on US presidential inaugural address texts, and metadata (for the corpus), from 1789 to present.
dfm_inaug <- corpus_subset(data_corpus_inaugural, Year <= 1826) %>% 
  tokens(remove_punct = TRUE) %>% 
  tokens_remove(stopwords('english')) %>% 
  dfm() %>%
  dfm_trim(min_termfreq = 10, verbose = FALSE)

set.seed(100)
textplot_wordcloud(dfm_inaug)

inaug_speech = data_corpus_inaugural

corpus_subset(data_corpus_inaugural, 
              President %in% c("Trump", "Obama", "Bush")) %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  dfm() %>%
  dfm_group(groups = President) %>%
  dfm_trim(min_termfreq = 5, verbose = FALSE) %>%
  textplot_wordcloud(comparison = TRUE)
Warning in wordcloud_comparison(x, min_size, max_size, min_count, max_words, :
throughout could not be fit on page. It will not be plotted.
Warning in wordcloud_comparison(x, min_size, max_size, min_count, max_words, :
children could not be fit on page. It will not be plotted.

textplot_wordcloud(dfm_inaug, min_count = 10,
                   color = c('red', 'pink', 'green', 'purple', 'orange', 'blue'))

data_corpus_inaugural_subset <- 
  corpus_subset(data_corpus_inaugural, Year > 1949)
kwic(tokens(data_corpus_inaugural_subset), pattern = "american") %>%
  textplot_xray()

textplot_xray(
  kwic(tokens(data_corpus_inaugural_subset), pattern = "american"),
  kwic(tokens(data_corpus_inaugural_subset), pattern = "people"),
  kwic(tokens(data_corpus_inaugural_subset), pattern = "communist")
  
)

## Why is the "communist" plot missing?

theme_set(theme_bw())
g <- textplot_xray(
  kwic(tokens(data_corpus_inaugural_subset), pattern = "american"),
  kwic(tokens(data_corpus_inaugural_subset), pattern = "people"),
  kwic(tokens(data_corpus_inaugural_subset), pattern = "communist")
)
g + aes(color = keyword) + 
  scale_color_manual(values = c("blue", "red", "green")) +
  theme(legend.position = "none")

library(quanteda.textstats)
Warning: package 'quanteda.textstats' was built under R version 4.3.3
Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"ndiMatrix" of class "replValueSp"; definition not updated
features_dfm_inaug <- textstat_frequency(dfm_inaug, n = 100)

# Sort by reverse frequency order
features_dfm_inaug$feature <- with(features_dfm_inaug, reorder(feature, -frequency))

ggplot(features_dfm_inaug, aes(x = feature, y = frequency)) +
  geom_point() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Get frequency grouped by president
freq_grouped <- textstat_frequency(dfm(tokens(data_corpus_inaugural_subset)), 
                                   groups = data_corpus_inaugural_subset$President)

# Filter the term "american"
freq_american <- subset(freq_grouped, freq_grouped$feature %in% "american")  

ggplot(freq_american, aes(x = group, y = frequency)) +
  geom_point() + 
  scale_y_continuous(limits = c(0, 14), breaks = c(seq(0, 14, 2))) +
  xlab(NULL) + 
  ylab("Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

dfm_rel_freq <- dfm_weight(dfm(tokens(data_corpus_inaugural_subset)), scheme = "prop") * 100
head(dfm_rel_freq)
Document-feature matrix of: 6 documents, 4,346 features (85.57% sparse) and 4 docvars.
                 features
docs                      my    friends        ,    before          i
  1953-Eisenhower 0.14582574 0.14582574 4.593511 0.1822822 0.10936930
  1957-Eisenhower 0.20975354 0.10487677 6.345045 0.1573152 0.05243838
  1961-Kennedy    0.19467878 0.06489293 5.451006 0.1297859 0.32446463
  1965-Johnson    0.17543860 0.05847953 5.555556 0.2339181 0.87719298
  1969-Nixon      0.28973510 0          5.546358 0.1241722 0.86920530
  1973-Nixon      0.05012531 0.05012531 4.812030 0.2005013 0.60150376
                 features
docs                   begin      the expression       of     those
  1953-Eisenhower 0.03645643 6.234050 0.03645643 5.176814 0.1458257
  1957-Eisenhower 0          5.977976 0          5.034085 0.1573152
  1961-Kennedy    0.19467878 5.580792 0          4.218040 0.4542505
  1965-Johnson    0          4.502924 0          3.333333 0.1754386
  1969-Nixon      0          5.629139 0          3.890728 0.4552980
  1973-Nixon      0          4.160401 0          3.408521 0.3007519
[ reached max_nfeat ... 4,336 more features ]
rel_freq <- textstat_frequency(dfm_rel_freq, groups = dfm_rel_freq$President)

# Filter the term "american"
rel_freq_american <- subset(rel_freq, feature %in% "american")  

ggplot(rel_freq_american, aes(x = group, y = frequency)) +
  geom_point() + 
  scale_y_continuous(limits = c(0, 0.7), breaks = c(seq(0, 0.7, 0.1))) +
  xlab(NULL) + 
  ylab("Relative frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

dfm_weight_pres <- data_corpus_inaugural %>%
  corpus_subset(Year > 2000) %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  dfm() %>%
  dfm_weight(scheme = "prop")

# Calculate relative frequency by president
freq_weight <- textstat_frequency(dfm_weight_pres, n = 10, 
                                  groups = dfm_weight_pres$President)

ggplot(data = freq_weight, aes(x = nrow(freq_weight):1, y = frequency)) +
  geom_point() +
  facet_wrap(~ group, scales = "free") +
  coord_flip() +
  scale_x_continuous(breaks = nrow(freq_weight):1,
                     labels = freq_weight$feature) +
  labs(x = NULL, y = "Relative frequency")

# Only select speeches by Obama and Trump
pres_corpus <- corpus_subset(data_corpus_inaugural, 
                             President %in% c("Obama", "Trump"))

# Create a dfm grouped by president
pres_dfm <- tokens(pres_corpus, remove_punct = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  tokens_group(groups = President) %>%
  dfm()

# Calculate keyness and determine Trump as target group
result_keyness <- textstat_keyness(pres_dfm, target = "Trump")

# Plot estimated word keyness
textplot_keyness(result_keyness) 

# Plot without the reference text (in this case Obama)
textplot_keyness(result_keyness, show_reference = FALSE)

library(quanteda.textmodels)

# Irish budget speeches from 2010 (data from quanteda.textmodels)
# Transform corpus to dfm
data(data_corpus_irishbudget2010, package = "quanteda.textmodels")
ie_dfm <- dfm(tokens(data_corpus_irishbudget2010))

# Set reference scores
refscores <- c(rep(NA, 4), 1, -1, rep(NA, 8))

# Predict Wordscores model
ws <- textmodel_wordscores(ie_dfm, y = refscores, smooth = 1)

# Plot estimated word positions (highlight words and print them in red)
textplot_scale1d(ws,
                 highlighted = c("minister", "have", "our", "budget"), 
                 highlighted_color = "red")

# Get predictions
pred <- predict(ws, se.fit = TRUE)

# Plot estimated document positions and group by "party" variable
textplot_scale1d(pred, margin = "documents",
                 groups = docvars(data_corpus_irishbudget2010, "party"))

# Plot estimated document positions using the LBG transformation and group by "party" variable

pred_lbg <- predict(ws, se.fit = TRUE, rescaling = "lbg")

textplot_scale1d(pred_lbg, margin = "documents",
                 groups = docvars(data_corpus_irishbudget2010, "party"))

# Estimate Wordfish model
library("quanteda.textmodels")
wf <- textmodel_wordfish(dfm(tokens(data_corpus_irishbudget2010)), dir = c(6, 5))

# Plot estimated word positions
textplot_scale1d(wf, margin = "features", 
                 highlighted = c("government", "global", "children", 
                                 "bank", "economy", "the", "citizenship",
                                 "productivity", "deficit"), 
                 highlighted_color = "red")

# Plot estimated document positions
textplot_scale1d(wf, groups = data_corpus_irishbudget2010$party)

# Transform corpus to dfm
ie_dfm <- dfm(tokens(data_corpus_irishbudget2010))

# Run correspondence analysis on dfm
ca <- textmodel_ca(ie_dfm)

# Plot estimated positions and group by party
textplot_scale1d(ca, margin = "documents",
                 groups = docvars(data_corpus_irishbudget2010, "party"))